Return to Main Page

Link to Book

library(tidyverse)
library(gapminder)

\(\color{darkblue}{\textbf{Program}}\)


See Basic R

\(\color{darkblue}{\textbf{Import}}\)


Cheatsheet for Importing Data

Also see tidycensus and SQL


\(\color{dodgerblue}{\textbf{Equity Check}}\)

  • Where does this data come from?
  • Why was this data collected?
  • How was this data generated?
  • Is this data demographically representative?
  • Who is included and who is excluded from this data?
  • Whose voices, lives, and experiences are missing?
  • How much can this data be disaggregated by race, gender, ethnicity, etc.?
  • Are the categories mutually exclusive and fully inclusive?
  • Are there “other” categories and, if so, who does that include?
  • Who stands to benefit from this data?
  • Who might be harmed by the collection or publication of this data?

(See more in Urban Institute’s Do No Harm Guide)


\(\color{dodgerblue}{\textbf{Tibbles}}\)

as_tibble() transforms existing data into a tibble

data %>% 
  as_tibble()


tibble() creates a tibble from new data

tibble(x = 1:5, 
       y = 1, 
       z = x ^ 2 + y)
x y z
1 1 2
2 1 5
3 1 10
4 1 17
5 1 26

\(\color{dodgerblue}{\textbf{Import Data}}\)

\(\color{skyblue}{\textrm{- Basic Files (readr)}}\)

readr (part of tidyverse) covers basics like csv files

  • delim = ";" specifies the non-comma delimiter ("\t" is tab, " " is space)
  • skip = n skips the first n lines
  • comment = "#" drops all lines that start with (e.g.) #
  • col_names = FALSE tells read_csv() not to treat the first row as headings and instead label them from X1 to Xn
  • col_names = c("x", "y", "z") renames the columns as the character vector
  • col_types = cols() reads in the columns as the specified data type
  • na = 999 replaces dataset-specific NA values (e.g., 999) as NA
read_csv("path/to/file.csv")

read_rds("path/to/file.rds")

read_delim("path/to/file.txt", delim = ";")
#Best Practice
csv_data <- read_csv("path/to/file.csv",
                     col_types = cols(w = col_double(),
                                      x = col_date(format = ""),
                                      y = col_character(),
                                      z = col_logical()))


\(\color{skyblue}{\textrm{- Stats (haven)}}\)

haven reads SPSS, Stata, and SAS files

library(haven)

read_sas("path/to/file.sas7bdat")

read_spss("path/to/file.spv")

read_stata("path/to/file.dta")


\(\color{skyblue}{\textrm{- Excel (readxl)}}\)

readxl reads excel files (both .xls and .xlsx)

library(readxl)

read_excel("path/to/file.xlsx")


\(\color{skyblue}{\textrm{- Databases (DBI)}}\)

DBI, along with a database specific backend (e.g. RMySQL, RSQLite, RPostgreSQL etc), runs SQL queries (See SQL)

library(DBI)

con <- "connection to database"

dbGetQuery(con, "
  
  SELECT * 
    FROM table
           
           ") %>% as_tibble()


\(\color{skyblue}{\textrm{- JSON Files (jsonlite)}}\)

jsonlite reads json files


\(\color{skyblue}{\textrm{- XML Files (xml2)}}\)

xml2 reads XML


\(\color{dodgerblue}{\textbf{Export}}\)

write_csv() exports a plain csv

dataset %>% write_csv("path/dataset.csv")


write_rds() preserves formatting as an rds file

dataset %>% write_rds("path/dataset.rds")

\(\color{dodgerblue}{\textbf{Initial Check}}\)

\(\color{skyblue}{\textrm{- Contents}}\)

nrow() returns the number of rows

gapminder %>% nrow()
## [1] 1704


ncol() returns the number of columns

gapminder %>% ncol()
## [1] 6


names() returns a list of all of the columns

gapminder %>% names()
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"


distinct() returns a list of all unique values in a specified column

gapminder %>% distinct(continent)
continent
Asia
Europe
Africa
Americas
Oceania


\(\color{skyblue}{\textrm{- Summary Stats}}\)

summary() gives basic stats on numeric columns, but I prefer this custom function

col_summaries <- function(.data){
  
  all_cols <- .data %>% names()
  
  test_col <- function(.test_col){
    test_data <- .data %>% 
      as_tibble() %>%
      rename(test = .test_col) %>% 
      mutate_if(is.logical, as.character)
    
    if(dim(test_data %>% filter(!is.na(test)))[1] == 0){
      test_data %>% 
        summarize(column = .test_col,
                  type = class(test_data$test),
                  missing = sum(is.na(test)),
                  unique_values = NA_real_,
                  example = NA_character_,
                  min = NA_real_,
                  q25 = NA_real_,
                  median = NA_real_,
                  q75 = NA_real_,
                  max = NA_real_)
    } else if(class(test_data$test) %in% c("integer", "numeric")){
      test_data %>% 
        summarize(column = .test_col,
                  type = class(test_data$test),
                  missing = sum(is.na(test)),
                  unique_values = NA_real_,
                  example = NA_character_,
                  min = min(test, na.rm = TRUE),
                  q25 = quantile(test, 0.25, na.rm = TRUE),
                  median = median(test, na.rm = TRUE),
                  q75 = quantile(test, 0.75, na.rm = TRUE),
                  max = max(test, na.rm = TRUE))
    } else {
      test_data %>% 
        summarize(column = .test_col,
                  type = class(test_data$test),
                  missing = sum(is.na(test)),
                  unique_values = n_distinct(test)) %>% 
        bind_cols(test_data %>% 
                    filter(!is.na(test)) %>% 
                    select(test) %>% 
                    head(1) %>% 
                    rename(example = test)) %>% 
        mutate(min = NA_real_,
               q25 = NA_real_,
               median = NA_real_,
               q75 = NA_real_,
               max = NA_real_ )
      } 
    }
  
  map_dfr(all_cols, test_col)
}
gapminder %>% col_summaries()
column type missing unique_values example min q25 median q75 max
country factor 0 142 Afghanistan NA NA NA NA NA
continent factor 0 5 Asia NA NA NA NA NA
year integer 0 NA NA 1952.0000 1965.750 1979.5000 1.993250e+03 2.007000e+03
lifeExp numeric 0 NA NA 23.5990 48.198 60.7125 7.084550e+01 8.260300e+01
pop integer 0 NA NA 60011.0000 2793664.000 7023595.5000 1.958522e+07 1.318683e+09
gdpPercap numeric 0 NA NA 241.1659 1202.060 3531.8470 9.325462e+03 1.135231e+05


\(\color{skyblue}{\textrm{- Missing Values}}\)

na = or recode sets what counts as missing

#When importing
dataframe <- read_csv("datasource.csv", 
                      na = c("", "NA", "UNKNOWN", 999))

#When already read-in
dateframe <- dataframe %>% 
  mutate(char_col = recode(char_col, "NA" = NA_character_),
         num_col = recode(num_col, 999 = NA_integer_))


any(is.na()) checks whether there are ANY missing values

#Overall
any(is.na(gapminder))
## [1] FALSE
#In a specific column
any(is.na(gapminder$lifeExp))
## [1] FALSE


summarize(sum(is.na())) counts the missing values within each column

gapminder %>% 
  summarize(observations = n(),
            missing_lifeExp = sum(is.na(lifeExp)))
observations missing_lifeExp
1704 0


A list of expected values can also check completeness

expected <- c("Africa" = 54, 
              "Americas" = 35, 
              "Asia" = 47,
              "Europe" = 43,
              "Oceania" = 14)

gapminder %>% 
  filter(year == 2007) %>% 
  group_by(continent) %>% 
  summarize(countries_represented = n()) %>% 
  mutate(countries_expected = expected[continent],
         missing_countries = countries_expected - countries_represented)
continent countries_represented countries_expected missing_countries
Africa 52 54 2
Americas 25 35 10
Asia 33 47 14
Europe 30 43 13
Oceania 2 14 12


\(\color{skyblue}{\textrm{- Cross Tabs}}\)

table() creates a cross tab count

gap_07 <- gapminder %>% 
  filter(year==2007) %>%
  mutate(country_size = case_when(pop > 3.121e+07 ~ "large",
                                  pop < 4.508e+06 ~ "small",
                                  TRUE            ~ "average"),
         country_size = factor(country_size, 
                               ordered = TRUE, 
                               levels = c("large", "average", "small")))

table(gap_07$continent, gap_07$country_size)
##           
##            large average small
##   Africa      10      25    17
##   Americas     6      13     6
##   Asia        13      14     6
##   Europe       7      17     6
##   Oceania      0       1     1


\(\color{skyblue}{\textrm{- Frequency Tables}}\)

prop.table() creates a frequency table, but I prefer this custom function for formatting

gap_07 <- gapminder %>% 
  filter(year==2007) %>%
  mutate(country_size = case_when(pop > 3.121e+07 ~ "large",
                                  pop < 4.508e+06 ~ "small",
                                  TRUE            ~ "average"),
         country_size = factor(country_size, 
                               ordered = TRUE, 
                               levels = c("large", "average", "small")))

size_table <- table(gap_07$continent, gap_07$country_size)

props <- function(col1, col2, prop_type){
  cross_tab <- table(col1, col2) 
  
  if(prop_type == "overall"){
    prop.table(cross_tab) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      spread(col2, Freq) %>%
      janitor::adorn_totals("col") %>% 
      janitor::adorn_totals("row") %>% 
      as_tibble() %>% 
      mutate_at(vars(-col1), scales::percent, accuracy = 0.1) %>% 
      rename(` ` = col1)
  } else if(prop_type == "row"){
    prop.table(cross_tab, 2) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      spread(col2, Freq) %>%
      janitor::adorn_totals("row") %>% 
      as_tibble() %>% 
      mutate_at(vars(-col1), scales::percent, accuracy = 0.1) %>% 
      rename(` ` = col1)
  } else{
    prop.table(cross_tab, 1) %>% 
      as.data.frame() %>% 
      as_tibble() %>% 
      spread(col2, Freq) %>%
      janitor::adorn_totals("col") %>% 
      as_tibble() %>% 
      mutate_at(vars(-col1), scales::percent, accuracy = 0.1) %>% 
      rename(` ` = col1)
  }
}
#Overall Proportions
props(gap_07$continent, gap_07$country_size, "overall")
large average small Total
Africa 7.0% 17.6% 12.0% 36.6%
Americas 4.2% 9.2% 4.2% 17.6%
Asia 9.2% 9.9% 4.2% 23.2%
Europe 4.9% 12.0% 4.2% 21.1%
Oceania 0.0% 0.7% 0.7% 1.4%
Total 25.4% 49.3% 25.4% 100.0%
#Column Proportions
props(gap_07$continent, gap_07$country_size, "col")
large average small Total
Africa 19.2% 48.1% 32.7% 100.0%
Americas 24.0% 52.0% 24.0% 100.0%
Asia 39.4% 42.4% 18.2% 100.0%
Europe 23.3% 56.7% 20.0% 100.0%
Oceania 0.0% 50.0% 50.0% 100.0%
#Row Proportions
props(gap_07$continent, gap_07$country_size, "row")
large average small
Africa 27.8% 35.7% 47.2%
Americas 16.7% 18.6% 16.7%
Asia 36.1% 20.0% 16.7%
Europe 19.4% 24.3% 16.7%
Oceania 0.0% 1.4% 2.8%
Total 100.0% 100.0% 100.0%


\(\color{skyblue}{\textrm{- Correlation Tables}}\)

cor() calculated the correlation between two variables, but this function will compare each variable to all other variables

library(reshape2)

get_pretty_cormap <- function (.data, .numcols) {
  cormat <- round(cor(select(.data, .numcols)), 2)
  
  reorder_cormat <- function(cormat){
    dd <- as.dist((1-cormat)/2)
    hc <- hclust(dd)
    cormat <-cormat[hc$order, hc$order]}
  cormat <- reorder_cormat(cormat)
  
  get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)}
  upper_tri <- get_upper_tri(cormat)
  
  melted_cormat <- melt(upper_tri, na.rm = TRUE)
  
  ggplot(melted_cormat, aes(Var2, Var1, fill = value)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0, limit = c(-1,1), 
                         space = "Lab", name = "Pearson\nCorrelation") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
    coord_fixed() +
    geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          panel.grid.major = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.ticks = element_blank(),
          legend.justification = c(1, 0),
          legend.position = c(0.6, 0.7),   
          legend.direction = "horizontal") +
    guides(fill = guide_colorbar(barwidth = 7, barheight = 1, title.position = "top", title.hjust = 0.5))}
get_pretty_cormap(gapminder, c("year", "lifeExp", "pop", "gdpPercap"))

\(\color{darkblue}{\textbf{Tidy}}\)


Cheatsheet for Tidying Data

Cheatsheet for Strings

Cheatsheet for Factors

Cheatsheet for Lubridates


\(\color{dodgerblue}{\textbf{Tidy Data}}\)


\(\color{skyblue}{\textrm{- Gather}}\)

dataframe %>% 
  gather(-c(cols to keep),
         key = key name, 
         value = value name, 
         factor_key = TRUE)

ratings %>% 
  gather(-Season, 
         key = Episode, 
         value = Rating, 
         factor_key = TRUE)


\(\color{skyblue}{\textrm{- Spread}}\)

dataframe %>% 
  spread(key = col to become new cols, 
         value = col to become values under new cols, 
         fill = value to fill for missing)

roster %>% 
  spread(key = Variable, 
         value = Rating)


\(\color{dodgerblue}{\textbf{Values}}\)

\(\color{skyblue}{\textrm{- Missing}}\)

filter(!is.na()) removes entire rows with missing values in the specified column

gapminder %>% 
  filter(!is.na(lifeExp)) 


replace_na() replaces missing values

gapminder %>% 
  mutate(pop = replace_na(pop, 0))


\(\color{skyblue}{\textrm{- Duplicates}}\)

distinct() keeps unique rows and removes all others that are completely identical

data %>% 
  distinct()


distinct(col, .keep_all = TRUE) (i.e., adding a column as an argument) just looks at the values within that column regardless of whether the other columns are the same (this will keep the first occurrence, so arrange as desired first)

#Most recent entry for each country
gapminder %>% 
  arrange(desc(year)) %>% 
  distinct(country, .keep_all = TRUE) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 2007 43.828 31889923 974.5803
Albania Europe 2007 76.423 3600523 5937.0295
Algeria Africa 2007 72.301 33333216 6223.3675
Angola Africa 2007 42.731 12420476 4797.2313
Argentina Americas 2007 75.320 40301927 12779.3796
Australia Oceania 2007 81.235 20434176 34435.3674


\(\color{skyblue}{\textrm{- Outliers}}\)

Causes:

  • Valid measurements that happen to be extreme
  • Variability in measurement
  • Experimental error
  • Data entry error
#Visual Check
boxplot(gap_07$pop)

All of the circles are technically outliers. We know that the two extremes (China and India) are, in fact, correct. If there was a third that was that extreme, it would be an error that should be removed or replaced. Similarly, if there was a negative population, that would also be an error that should be removed or replaced.


#3SD Check
upper_out <- mean(gap_07$pop) + (3 * sd(gap_07$pop))
lower_out <- mean(gap_07$pop) - (3 * sd(gap_07$pop))

gap_07 %>% 
  filter(pop > upper_out | pop < lower_out)
country continent year lifeExp pop gdpPercap country_size
China Asia 2007 72.961 1318683096 4959.115 large
India Asia 2007 64.698 1110396331 2452.210 large

\(\color{dodgerblue}{\textbf{Selecting}}\)

\(\color{skyblue}{\textrm{- Filter (observations)}}\)

filter() selects all observations (rows) with values matching the specification(s)

Code Meaning
== Equal to
!= Not equal to
> Greater than
>= Greater than or equal to
< Less than
<= Less than or equal to
%in% c() Within the character vector
!(. %in% c()) Not within the character vector
is.na() Is NA
!is.na() Is not NA
between(x, y) Is between x and y
, or & And
| Or
gapminder %>% 
  filter(between(year, 2000, 2020),
         !(continent %in% c("Africa", "Americas", "Asia")),
         !is.na(gdpPercap)) %>% 
  filter(lifeExp > 81 |
         gdpPercap > 40000) %>% 
  head()
country continent year lifeExp pop gdpPercap
Australia Oceania 2007 81.235 20434176 34435.37
Iceland Europe 2007 81.757 301931 36180.79
Ireland Europe 2007 78.885 4109086 40676.00
Norway Europe 2002 79.050 4535591 44683.98
Norway Europe 2007 80.196 4627926 49357.19
Switzerland Europe 2007 81.701 7554661 37506.42


\(\color{skyblue}{\textrm{- Select (variables)}}\)

select() selects all variables (columns) called for

  • starts_with("abc") selects columns that begin with “abc”
  • ends_with("xyz") selects columns that end with “xyz”
  • contains("ijk") selects columns that contain “ijk”
  • matches("(.)\\1") selects columns that match a regular expression
  • everything() selects everything else (useful for reorganizing columns)
#Just
gapminder %>% 
  select(country, year, pop) %>% 
  head()
country year pop
Afghanistan 1952 8425333
Afghanistan 1957 9240934
Afghanistan 1962 10267083
Afghanistan 1967 11537966
Afghanistan 1972 13079460
Afghanistan 1977 14880372
#Not
gapminder %>% 
  select(-c(continent, pop)) %>% 
  head()
country year lifeExp gdpPercap
Afghanistan 1952 28.801 779.4453
Afghanistan 1957 30.332 820.8530
Afghanistan 1962 31.997 853.1007
Afghanistan 1967 34.020 836.1971
Afghanistan 1972 36.088 739.9811
Afghanistan 1977 38.438 786.1134
#Range from this to that
gapminder %>% 
  select(continent:pop) %>% 
  head()
continent year lifeExp pop
Asia 1952 28.801 8425333
Asia 1957 30.332 9240934
Asia 1962 31.997 10267083
Asia 1967 34.020 11537966
Asia 1972 36.088 13079460
Asia 1977 38.438 14880372

\(\color{dodgerblue}{\textbf{Columns}}\)

\(\color{skyblue}{\textrm{- Renaming}}\)

rename(new name = old name) renames the column

gapminder %>%
  rename(population = pop) %>% 
  head()
country continent year lifeExp population gdpPercap
Afghanistan Asia 1952 28.801 8425333 779.4453
Afghanistan Asia 1957 30.332 9240934 820.8530
Afghanistan Asia 1962 31.997 10267083 853.1007
Afghanistan Asia 1967 34.020 11537966 836.1971
Afghanistan Asia 1972 36.088 13079460 739.9811
Afghanistan Asia 1977 38.438 14880372 786.1134


\(\color{skyblue}{\textrm{- Separating}}\)

separate() splits each value within a column based on some separator

gapminder %>% 
  separate(col = lifeExp, 
           into = c("Years", "Months"), 
           sep = "[.]") %>%
  mutate(Months = as.double(str_c("0.", Months)) * 12) %>% 
  head()
country continent year Years Months pop gdpPercap
Afghanistan Asia 1952 28 9.612 8425333 779.4453
Afghanistan Asia 1957 30 3.984 9240934 820.8530
Afghanistan Asia 1962 31 11.964 10267083 853.1007
Afghanistan Asia 1967 34 0.240 11537966 836.1971
Afghanistan Asia 1972 36 1.056 13079460 739.9811
Afghanistan Asia 1977 38 5.256 14880372 786.1134


\(\color{skyblue}{\textrm{- Uniting}}\)

unite() merges the values of two columns into a new column, placing some separator between the values

gapminder %>% 
  unite(col = location, 
        country, continent, 
        sep = ", ") %>%
  head()
location year lifeExp pop gdpPercap
Afghanistan, Asia 1952 28.801 8425333 779.4453
Afghanistan, Asia 1957 30.332 9240934 820.8530
Afghanistan, Asia 1962 31.997 10267083 853.1007
Afghanistan, Asia 1967 34.020 11537966 836.1971
Afghanistan, Asia 1972 36.088 13079460 739.9811
Afghanistan, Asia 1977 38.438 14880372 786.1134

\(\color{dodgerblue}{\textbf{Mutating}}\)

mutate() adds columns to the existing dataframe

gapminder %>% 
  mutate(gdp = gdpPercap * as.double(pop)) %>% 
  head()
country continent year lifeExp pop gdpPercap gdp
Afghanistan Asia 1952 28.801 8425333 779.4453 6567086330
Afghanistan Asia 1957 30.332 9240934 820.8530 7585448670
Afghanistan Asia 1962 31.997 10267083 853.1007 8758855797
Afghanistan Asia 1967 34.020 11537966 836.1971 9648014150
Afghanistan Asia 1972 36.088 13079460 739.9811 9678553274
Afghanistan Asia 1977 38.438 14880372 786.1134 11697659231


transmute() adds columns and then only selects the specified columns

gapminder %>% 
  transmute(country, 
            year, 
            gdp = gdpPercap * as.double(pop)) %>% 
  head()
country year gdp
Afghanistan 1952 6567086330
Afghanistan 1957 7585448670
Afghanistan 1962 8758855797
Afghanistan 1967 9648014150
Afghanistan 1972 9678553274
Afghanistan 1977 11697659231


\(\color{skyblue}{\textrm{- Recasting}}\)

  • as.character()
  • as.double()
  • as.integer()
  • as.logical()
gapminder %>% 
  mutate(pop = as.double(pop)) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 1952 28.801 8425333 779.4453
Afghanistan Asia 1957 30.332 9240934 820.8530
Afghanistan Asia 1962 31.997 10267083 853.1007
Afghanistan Asia 1967 34.020 11537966 836.1971
Afghanistan Asia 1972 36.088 13079460 739.9811
Afghanistan Asia 1977 38.438 14880372 786.1134


\(\color{skyblue}{\textrm{- Factoring}}\)

factor() recategorizes strings as categorical data; ordered = TRUE orders the categories by the levels = c() argument (otherwise it will be alphabetical)

gapminder %>% 
  mutate(country = factor(country),
         continent = factor(continent, 
                            ordered = TRUE,
                            levels = c("Asia", "Americas", "Europe", 
                                       "Africa", "Oceania"))) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 1952 28.801 8425333 779.4453
Afghanistan Asia 1957 30.332 9240934 820.8530
Afghanistan Asia 1962 31.997 10267083 853.1007
Afghanistan Asia 1967 34.020 11537966 836.1971
Afghanistan Asia 1972 36.088 13079460 739.9811
Afghanistan Asia 1977 38.438 14880372 786.1134


\(\color{skyblue}{\textrm{- Cases}}\)

case_when() creates a new field of categorical data (that still needs to be factored)

 gapminder %>% 
  filter(year == 2007) %>%
  mutate(country_size = case_when(pop > 3.121e+07 ~ "Large",
                                  pop < 4.508e+06 ~ "Small",
                                  TRUE            ~ "Average"),
         country_size = factor(country_size,
                               ordered = TRUE,
                               levels = c("Small", "Average", "Large"))) %>% 
  head()
country continent year lifeExp pop gdpPercap country_size
Afghanistan Asia 2007 43.828 31889923 974.5803 Large
Albania Europe 2007 76.423 3600523 5937.0295 Small
Algeria Africa 2007 72.301 33333216 6223.3675 Large
Angola Africa 2007 42.731 12420476 4797.2313 Average
Argentina Americas 2007 75.320 40301927 12779.3796 Large
Australia Oceania 2007 81.235 20434176 34435.3674 Average


\(\color{skyblue}{\textrm{- Recoding Values}}\)

recode(old value = new value) replaces all occurrences of the old value with the new value

 gapminder %>% 
  mutate(continent = recode(continent, 
                            "Asia" = "Asia & Pacific Islands",
                            "Oceania" = "Australia & New Zealand")) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia & Pacific Islands 1952 28.801 8425333 779.4453
Afghanistan Asia & Pacific Islands 1957 30.332 9240934 820.8530
Afghanistan Asia & Pacific Islands 1962 31.997 10267083 853.1007
Afghanistan Asia & Pacific Islands 1967 34.020 11537966 836.1971
Afghanistan Asia & Pacific Islands 1972 36.088 13079460 739.9811
Afghanistan Asia & Pacific Islands 1977 38.438 14880372 786.1134


A lookup table recodes all values faster:

library(hflights)

hflights %>% 
  count(UniqueCarrier) %>% 
  arrange(desc(n)) %>% 
  head()
UniqueCarrier n
XE 73053
CO 70032
WN 45343
OO 16061
MQ 4648
US 4082
airline_lookup_table <- c("AA" = "American",
                          "AS" = "Alaska",
                          "B6" = "JetBlue",
                          "CO" = "Continental",
                          "DL" = "Delta",
                          "OO" = "SkyWest",
                          "UA" = "United",
                          "US" = "US_Airways", 
                          "WN" = "Southwest", 
                          "EV" = "Atlantic_Southeast", 
                          "F9" = "Frontier", 
                          "FL" = "AirTran", 
                          "MQ" = "American_Eagle", 
                          "XE" = "ExpressJet", 
                          "YV" = "Mesa")

hflights %>% 
  mutate(UniqueCarrier = airline_lookup_table[UniqueCarrier]) %>%
  count(UniqueCarrier) %>% 
  arrange(desc(n)) %>% 
  head()
UniqueCarrier n
ExpressJet 73053
Continental 70032
Southwest 45343
SkyWest 16061
American_Eagle 4648
US_Airways 4082


\(\color{skyblue}{\textrm{- Leading/Lagging Values}}\)

First order with arrange() and arrange(desc()), then lead() and lag() pull the leading or lagging value

gapminder %>% 
  filter(country == "United States",
         year > 1990) %>% 
  arrange(year) %>% 
  transmute(year,
            gdpPercap,
            prior_gdpPercap = lag(gdpPercap),
            next_gdpPercap = lead(gdpPercap))
year gdpPercap prior_gdpPercap next_gdpPercap
1992 32003.93 NA 35767.43
1997 35767.43 32003.93 39097.10
2002 39097.10 35767.43 42951.65
2007 42951.65 39097.10 NA


\(\color{skyblue}{\textrm{- Rolling Analysis}}\)

First order with arrange() and arrange(desc()), then cum_() does a rolling analysis

gapminder %>% 
  filter(country == "United States",
         year > 1990) %>% 
  arrange(year) %>% 
  transmute(year,
            gdpPercap,
            rolling_sum = cumsum(gdpPercap), 
            rolling_mean = cummean(gdpPercap), 
            min_to_date = cummin(gdpPercap),
            max_to_date = cummax(gdpPercap))
year gdpPercap rolling_sum rolling_mean min_to_date max_to_date
1992 32003.93 32003.93 32003.93 32003.93 32003.93
1997 35767.43 67771.37 33885.68 32003.93 35767.43
2002 39097.10 106868.46 35622.82 32003.93 39097.10
2007 42951.65 149820.12 37455.03 32003.93 42951.65


\(\color{skyblue}{\textrm{- Whole Column Analysis}}\)

Calling a column by name analyzes the whole column

gapminder %>% 
  filter(year == 2007) %>% 
  transmute(country,
            continent,
            lifeExp,
            world_lifeExp = mean(lifeExp, na.rm = TRUE),
            above_world_lifeExp = lifeExp > world_lifeExp,
            pop) %>% 
  arrange(desc(pop)) %>% 
  head()
country continent lifeExp world_lifeExp above_world_lifeExp pop
China Asia 72.961 67.00742 TRUE 1318683096
India Asia 64.698 67.00742 FALSE 1110396331
United States Americas 78.242 67.00742 TRUE 301139947
Indonesia Asia 70.650 67.00742 TRUE 223547000
Brazil Americas 72.390 67.00742 TRUE 190010647
Pakistan Asia 65.483 67.00742 FALSE 169270617


\(\color{skyblue}{\textrm{- Rankings}}\)

  • ntile() categorizes the data into n even groups
  • rank() ranks the column
gapminder %>% 
  filter(year == 2007) %>% 
  transmute(country,
            continent,
            lifeExp,
            lifeExpQuantile = ntile(desc(lifeExp), 4),
            lifeExpRank = rank(desc(lifeExp)),
            pop) %>% 
  arrange(desc(pop)) %>% 
  head()
country continent lifeExp lifeExpQuantile lifeExpRank pop
China Asia 72.961 2 59 1318683096
India Asia 64.698 3 91 1110396331
United States Americas 78.242 1 30 301139947
Indonesia Asia 70.650 3 80 223547000
Brazil Americas 72.390 2 68 190010647
Pakistan Asia 65.483 3 89 169270617


\(\color{skyblue}{\textrm{- Totals}}\)

The janitor package can adorn_totals()

#Row total
gapminder %>% 
  filter(continent == "Oceania",
         year > 1990) %>% 
  select(country, year, pop) %>% 
  spread(year, pop) %>% 
  janitor::adorn_totals("col") %>% 
  as_tibble()
country 1992 1997 2002 2007 Total
Australia 17481977 18565243 19546792 20434176 76028188
New Zealand 3437674 3676187 3908037 4115771 15137669
#Column total
gapminder %>% 
  filter(continent == "Oceania",
         year > 1990) %>% 
  select(country, year, pop) %>% 
  spread(year, pop) %>% 
  janitor::adorn_totals("row") %>% 
  as_tibble()
country 1992 1997 2002 2007
Australia 17481977 18565243 19546792 20434176
New Zealand 3437674 3676187 3908037 4115771
Total 20919651 22241430 23454829 24549947
#Both totals
gapminder %>% 
  filter(continent == "Oceania",
         year > 1990) %>% 
  select(country, year, pop) %>% 
  spread(year, pop) %>% 
  janitor::adorn_totals("row") %>%
  janitor::adorn_totals("col") %>%
  as_tibble()
country 1992 1997 2002 2007 Total
Australia 17481977 18565243 19546792 20434176 76028188
New Zealand 3437674 3676187 3908037 4115771 15137669
Total 20919651 22241430 23454829 24549947 91165857


\(\color{skyblue}{\textrm{- Grouped Analysis}}\)

group_by can perform any analysis within a group instead of over the whole column

gapminder %>% 
  filter(year == 2007) %>%
  transmute(country,
            continent,
            lifeExp,
            overall_quartile = ntile(desc(lifeExp), 4)) %>% 
  group_by(continent) %>% 
  mutate(continent_quantile = ntile(desc(lifeExp), 4)) %>% 
  ungroup() %>% 
  arrange(continent) %>% 
  head()
country continent lifeExp overall_quartile continent_quantile
Algeria Africa 72.301 2 1
Angola Africa 42.731 4 4
Benin Africa 56.728 4 2
Botswana Africa 50.728 4 3
Burkina Faso Africa 52.295 4 3
Burundi Africa 49.580 4 3


\(\color{skyblue}{\textrm{- Multiple Columns}}\)

mutate_all, mutate_at, and mutate_if analyze multiple columns simultaneously with the same function

#All 
gapminder %>% 
  mutate_all(as.character) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 1952 28.801 8425333 779.4453145
Afghanistan Asia 1957 30.332 9240934 820.8530296
Afghanistan Asia 1962 31.997 10267083 853.10071
Afghanistan Asia 1967 34.02 11537966 836.1971382
Afghanistan Asia 1972 36.088 13079460 739.9811058
Afghanistan Asia 1977 38.438 14880372 786.11336
#Just
gapminder %>% 
  mutate_at(vars(lifeExp, gdpPercap), round, 1) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 1952 28.8 8425333 779.4
Afghanistan Asia 1957 30.3 9240934 820.9
Afghanistan Asia 1962 32.0 10267083 853.1
Afghanistan Asia 1967 34.0 11537966 836.2
Afghanistan Asia 1972 36.1 13079460 740.0
Afghanistan Asia 1977 38.4 14880372 786.1
#All but
gapminder %>% 
  mutate_at(vars(-country, -continent), as.double) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 1952 28.801 8425333 779.4453
Afghanistan Asia 1957 30.332 9240934 820.8530
Afghanistan Asia 1962 31.997 10267083 853.1007
Afghanistan Asia 1967 34.020 11537966 836.1971
Afghanistan Asia 1972 36.088 13079460 739.9811
Afghanistan Asia 1977 38.438 14880372 786.1134
#Like
gapminder %>% 
  mutate_at(vars(matches("gdp")), round, 1) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 1952 28.801 8425333 779.4
Afghanistan Asia 1957 30.332 9240934 820.9
Afghanistan Asia 1962 31.997 10267083 853.1
Afghanistan Asia 1967 34.020 11537966 836.2
Afghanistan Asia 1972 36.088 13079460 740.0
Afghanistan Asia 1977 38.438 14880372 786.1
#If
gapminder %>% 
  mutate_if(is.numeric, scales::comma, accuracy = 0.1) %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 1,952.0 28.8 8,425,333.0 779.4
Afghanistan Asia 1,957.0 30.3 9,240,934.0 820.9
Afghanistan Asia 1,962.0 32.0 10,267,083.0 853.1
Afghanistan Asia 1,967.0 34.0 11,537,966.0 836.2
Afghanistan Asia 1,972.0 36.1 13,079,460.0 740.0
Afghanistan Asia 1,977.0 38.4 14,880,372.0 786.1

\(\color{dodgerblue}{\textbf{Summarizing}}\)

summarize() groups and returns a single summary statistic

gapminder %>% 
  filter(year == 2007) %>% 
  group_by(continent) %>% 
  summarize(count = n(), 
            missing = sum(is.na(lifeExp)),
            unique_countries = n_distinct(country),
            low_lifeExp = min(lifeExp, na.rm = TRUE),
            high_lifeExp = max(lifeExp, na.rm = TRUE),
            mean_lifeExp = mean(lifeExp, na.rm = TRUE),
            median_lifeEx = median(lifeExp, na.rm = TRUE),
            above_world_mean = sum(lifeExp > mean(gapminder %>% 
                                                    filter(year == 2007) %>% 
                                                    .$lifeExp,
                                                  na.rm = TRUE)),
            share_above_world_mean = above_world_mean / count) %>% 
  ungroup()
continent count missing unique_countries low_lifeExp high_lifeExp mean_lifeExp median_lifeEx above_world_mean share_above_world_mean
Africa 52 0 52 39.613 76.442 54.80604 52.9265 7 0.1346154
Americas 25 0 25 60.916 80.653 73.60812 72.8990 23 0.9200000
Asia 33 0 33 43.828 82.603 70.72848 72.3960 23 0.6969697
Europe 30 0 30 71.777 81.757 77.64860 78.6085 30 1.0000000
Oceania 2 0 2 80.204 81.235 80.71950 80.7195 2 1.0000000

\(\color{dodgerblue}{\textbf{Strings}}\)

\(\color{skyblue}{\textrm{- Trimming Spaces}}\)

str_trim() cuts out spaces at the beginning and end of the string

str_trim("  this is a test     ")
## [1] "this is a test"


\(\color{skyblue}{\textrm{- Padding}}\)

str_pad() adds a number (width) of characters (pad) at the beginning (left) or end (right) of a string

county_codes <- c("1", "51", "201")

str_pad(county_codes, 
        width = 3, 
        side = "left", 
        pad = "0")
## [1] "001" "051" "201"


\(\color{skyblue}{\textrm{- Extract Number}}\)

parse_number() drops all non-numeric characters

parse_number("$1,234,567")
## [1] 1234567


\(\color{skyblue}{\textrm{- Replacing}}\)

str_replace(string, old, new) swaps out old for new characters

#Replace First
str_replace("bananas","a","o")
## [1] "bonanas"
#Replace All
str_replace_all("bananas","a","o")
## [1] "bononos"


str_remove(string, old) just drops the old characters

#Remove  First
str_remove("bananas","a")
## [1] "bnanas"
#Remove All
str_remove_all("bananas","a")
## [1] "bnns"


\(\color{skyblue}{\textrm{- Capitalization}}\)

str has some capitalization conventions (though setting all to lower is best for matching)

text <- "nEw YoRk CiTy"

#All Lower
str_to_lower(text)
## [1] "new york city"
#All Upper
str_to_upper(text)
## [1] "NEW YORK CITY"
#First Word Capitalized
str_to_sentence(text)
## [1] "New york city"
#All Words Capitalized
str_to_title(text)
## [1] "New York City"

\(\color{dodgerblue}{\textbf{Dates}}\)

library(lubridate)


A date-time is a point on the timeline stored as the number of sections since 1970-01-01 00:00:00 UTC

as_datetime(1511870400) 
## [1] "2017-11-28 12:00:00 UTC"


A date is a point on the timeline stored as the number of days since 1970-01-01

as_date(17498)
## [1] "2017-11-28"


A time is a point on the timeline stored as the number of seconds since 00:00:00

hms::as_hms(10230)
## 02:50:30


\(\color{skyblue}{\textrm{- Current}}\)

now() returns the date-time

now()
## [1] "2022-02-10 17:46:04 EST"


today() returns the date

today()
## [1] "2022-02-10"


\(\color{skyblue}{\textrm{- Parsing}}\)

The functions match the order of the components. For example, 08/07/06 could be:

  • Aug 7, 2006
  • Jul 8, 2006
  • Jul 6, 2008
  • Jun 7, 2008
mdy("08/07/06")
## [1] "2006-08-07"
dmy("08/07/06")
## [1] "2006-07-08"
ymd("08/07/06")
## [1] "2008-07-06"
ydm("08/07/06")
## [1] "2008-06-07"


It also works for more complicated renderings like “The 4th of July, 2018”

dmy("The 4th of July, 2018")
## [1] "2018-07-04"


And for time

mdy_hms("08/07/06 10:09:30")
## [1] "2006-08-07 10:09:30 UTC"


For more complex strings, the format can be specified:

  • y = year (2021 or 21, but defaults to current century)

  • m = numerical month (1-12 or 01-12)

  • b = month name (January or Jan)

  • d = day of the month (1-31 or 01-31)

  • w = numerical day of the week (0-6 with Sunday as 0)

  • a = day of the week name (Monday or Mon)

  • _ = split between date and time

  • H = 24 hour (0-24 or 00-24)

  • I = am/pm hour (1-12 or 01-12)

  • p = am or pm when using I

  • M = minute (0-59 or 00-59)

  • s = second (0-61 or 00-61)

parse_date_time("Monday June 1st 2010 at 4pm", orders = "amdy_Ip")
## [1] "2010-06-01 16:00:00 UTC"


\(\color{skyblue}{\textrm{- Time Zones}}\)

tz() checks the timezone

tz(ymd_hms("2017-03-11 12:00:00"))
## [1] "UTC"


tz = sets the timezone

ymd_hms("2017-03-11 12:00:00", tz = "America/Los_Angeles")
## [1] "2017-03-11 12:00:00 PST"


force_tz() changes an incorrect timezone (without changing the time value)

force_tz(ymd_hms("2017-03-11 12:00:00"), tzone = "America/New_York")
## [1] "2017-03-11 12:00:00 EST"


with_tz converts to a new timezone (thus changing the time value)

with_tz(ymd_hms("2017-03-11 12:00:00"), tzone = "America/New_York")
## [1] "2017-03-11 07:00:00 EST"


\(\color{skyblue}{\textrm{- Extracting Elements}}\)

  • date() = drops the time

  • year() = year

  • month() = month

  • day() = number day of the month

  • yday() = number day of the year (1-365)

  • wday() = day of the week

  • hour() = hour

  • min() = minute

  • leap_year() = true/false for leap year

  • dst() = true/false for daylight savings

  • quarter() = quarter of year

  • semester() = half of year

  • label = TRUE for word instead of number

  • abbr = TRUE for abbreviated word

str_c("Today is day ", day(today()),
      " of the month of ", month(today(), label = TRUE, abbr = FALSE),
      ", which is a ", wday(today(), label = TRUE, abbr = FALSE),
      ". It is also day ", yday(today()), 
      " of ", year(today()),
      ".") 
## [1] "Today is day 10 of the month of February, which is a Thursday. It is also day 41 of 2022."


\(\color{skyblue}{\textrm{- Rounding}}\)

round_date() gives the nearest unit

round_date(ymd("2021-06-30"), unit = "month")
## [1] "2021-07-01"


ceiling_date() rounds up

ceiling_date(ymd_hm("2021-06-30 12:50"), unit = "15 minutes")
## [1] "2021-06-30 13:00:00 UTC"


floor_date() rounds down

floor_date(ymd_hm("2021-06-30 12:50"), unit = "hour")
## [1] "2021-06-30 12:00:00 UTC"


\(\color{skyblue}{\textrm{- Adding Time}}\)

Time does not behave like general number lines. To evaluate, you must ask which is more important: the datetime in the real world or the length of time (duration)?

  • seconds() / dseconds()
  • minutes() / dminutes()
  • hours() / dhours()
  • days() / ddays()
  • weeks() / dweeks()
  • months() / dmonths()
  • years() / dyears()
#Normal Day
normal_day <- ymd_hm("2018-01-01 24:00", tz = "America/New_York")

period <- normal_day + hours(3)

duration <- normal_day + dhours(3)

tibble(Difference = difftime(period, duration, units = "hours"),
       Period = period,
       Duration = duration)
Difference Period Duration
0 hours 2018-01-02 03:00:00 2018-01-02 03:00:00


#Daylight Savings
daylight_savings_start <- ymd_hm("2018-03-10 24:00", tz = "America/New_York")

period <- daylight_savings_start + hours(3)

duration <- daylight_savings_start + dhours(3)

tibble(Difference = difftime(period, duration, units = "hours"),
       Period = period,
       Duration = duration)
Difference Period Duration
-1 hours 2018-03-11 03:00:00 2018-03-11 04:00:00


#Exit Daylight Savings
daylight_savings_end <- ymd_hm("2018-11-03 24:00", tz = "America/New_York")

period <- daylight_savings_end + hours(3)

duration <- daylight_savings_end + dhours(3)

tibble(Difference = difftime(period, duration, units = "hours"),
       Period = period,
       Duration = duration)
Difference Period Duration
1 hours 2018-11-04 03:00:00 2018-11-04 02:00:00


#Normal Year
normal_year <- ymd("2018-09-20", tz = "America/New_York")

period <- normal_year + years(1)

duration <- normal_year + years(1)

tibble(Difference = difftime(period, duration, units = "days"),
       Period = period,
       Duration = duration)
Difference Period Duration
0 days 2019-09-20 2019-09-20


#Leap Year
leap_year <- ymd("2019-09-20", tz = "America/New_York")

period <- leap_year + years(1)

duration <- leap_year + dyears(1)

tibble(Difference = difftime(period, duration, units = "days"),
       Period = period,
       Duration = duration)
Difference Period Duration
0.75 days 2020-09-20 2020-09-19 06:00:00


For months, imaginary dates (e.g. Feb 31) need to be dealt with:

ymd("2019-1-31") + months(1)
## [1] NA
ymd("2019-1-31") %>% add_with_rollback(months(1))
## [1] "2019-02-28"
ymd("2019-1-31") %>% add_with_rollback(months(1), roll_to_first = TRUE)
## [1] "2019-03-01"
ymd("2019-1-31") + dmonths(1)
## [1] "2019-03-02 10:30:00 UTC"


\(\color{skyblue}{\textrm{- Difference Between Times}}\)

difftime() measures the duration between two real datetimes

  • secs
  • mins
  • hours
  • days
  • weeks
#Normal Day
difftime(ymd_hm("2018-01-02 03:00", tz = "America/New_York"),
         ymd_hm("2018-01-01 24:00", tz = "America/New_York"), 
         unit = "days")
## Time difference of 0.125 days
#Enter Daylight Savings
difftime(ymd_hm("2018-03-11 03:00", tz = "America/New_York"),
         ymd_hm("2018-03-10 24:00", tz = "America/New_York"), 
         unit = "days")
## Time difference of 0.08333333 days
#Exit Daylight Savings
difftime(ymd_hm("2018-11-04 03:00", tz = "America/New_York"),
         ymd_hm("2018-11-03 24:00", tz = "America/New_York"), 
         unit = "days")
## Time difference of 0.1666667 days


\(\color{skyblue}{\textrm{- Intervals}}\)

interval() creates an interval

ryan <- interval(ymd("1990-11-04"), today())
ryan
## [1] 1990-11-04 UTC--2022-02-10 UTC


int_length() measures the length

int_length(ryan) %>% dseconds()
## [1] "986774400s (~31.27 years)"


int_shift() modifies the interval

int_shift(ryan, days(5))
## [1] 1990-11-09 UTC--2022-02-15 UTC


%within% determines if a date is within the interval

the_90s <- interval(ymd("1990-01-01"), ymd("1999-12-31"))

today() %within% the_90s
## [1] FALSE


int_overlaps() determines if two intervals overlap

the_1900s <- interval(ymd("1900-01-01"), ymd("1999-12-31"))

the_90s %within% the_1900s
## [1] TRUE
the_1900s %within% the_90s
## [1] FALSE
int_overlaps(the_90s, the_1900s)
## [1] TRUE

\(\color{dodgerblue}{\textbf{Combining Data}}\)

\(\color{skyblue}{\textrm{- Joins}}\)

  • inner_join()
  • full_join()
  • left_join()
  • right_join()
# Basic
joined_data <- data1 %>%
  inner_join(data2, by = "common_col")

# Multiple keys 
joined_data <- data1 %>%
  inner_join(data2, by = c("common_col1", "common_col2")

# Key with different name
joined_data <- data1 %>%
  inner_join(data2, by = c("col from df1" = "col from df2"))


\(\color{skyblue}{\textrm{- Filter Joins}}\)

sec_council <- tibble(country = c("United States", "United Kingdom", "France", 
                                  "Russia", "China"))


semi_join() returns data from one dataset that is present in a second dataset

gapminder %>% 
  filter(year == 2007) %>% 
  semi_join(sec_council, by = "country") 
country continent year lifeExp pop gdpPercap
China Asia 2007 72.961 1318683096 4959.115
France Europe 2007 80.657 61083916 30470.017
United Kingdom Europe 2007 79.425 60776238 33203.261
United States Americas 2007 78.242 301139947 42951.653


anti_join() returns data from one dataset that is NOT present in a second dataset

gapminder %>% 
  filter(year == 2007) %>% 
  anti_join(sec_council, by = "country") %>% 
  head()
country continent year lifeExp pop gdpPercap
Afghanistan Asia 2007 43.828 31889923 974.5803
Albania Europe 2007 76.423 3600523 5937.0295
Algeria Africa 2007 72.301 33333216 6223.3675
Angola Africa 2007 42.731 12420476 4797.2313
Argentina Americas 2007 75.320 40301927 12779.3796
Australia Oceania 2007 81.235 20434176 34435.3674


\(\color{skyblue}{\textrm{- Binding}}\)

df1 <- tibble("A" = c(1, 2, 3),
              "B" = c(6, 5, 4))

df2 <- tibble("A" = c(4, 5, 6),
              "B" = c(3, 2, 1),
              "C" = c("L", "M", "N"))

df3 <- tibble("B" = c(4, 5, 6),
              "C" = c("1", "2", "3"))


bind_rows() tacks on new rows

df1 %>% 
  bind_rows(df2)
A B C
1 6 NA
2 5 NA
3 4 NA
4 3 L
5 2 M
6 1 N


bind_cols() tacks on new columns

df1 %>% 
  bind_cols(df3)
A B…2 B…3 C
1 6 4 1
2 5 5 2
3 4 6 3


\(\color{skyblue}{\textrm{- Sets}}\)

country_z <- gapminder %>% 
  filter(year == 2007,
         str_detect(str_to_lower(country), "z"))

country_w <- gapminder %>% 
  filter(year == 2007,
         str_detect(str_to_lower(country), "w"))


union() returns all, removing duplicates

country_z %>% 
  union(country_w) %>% 
  head()
country continent year lifeExp pop gdpPercap
Bosnia and Herzegovina Europe 2007 74.852 4552198 7446.2988
Brazil Americas 2007 72.390 190010647 9065.8008
Czech Republic Europe 2007 76.486 10228744 22833.3085
Mozambique Africa 2007 42.082 19951656 823.6856
New Zealand Oceania 2007 80.204 4115771 25185.0091
Swaziland Africa 2007 39.613 1133066 4513.4806


intersect() returns only observations in both

country_z %>% 
  intersect(country_w) %>% 
  head()
country continent year lifeExp pop gdpPercap
New Zealand Oceania 2007 80.204 4115771 25185.0091
Swaziland Africa 2007 39.613 1133066 4513.4806
Switzerland Europe 2007 81.701 7554661 37506.4191
West Bank and Gaza Asia 2007 73.422 4018332 3025.3498
Zimbabwe Africa 2007 43.487 12311143 469.7093


setdiff() returns only observations in the first that are not in the second

country_z %>%
  setdiff(country_w) %>% 
  head()
country continent year lifeExp pop gdpPercap
Bosnia and Herzegovina Europe 2007 74.852 4552198 7446.2988
Brazil Americas 2007 72.390 190010647 9065.8008
Czech Republic Europe 2007 76.486 10228744 22833.3085
Mozambique Africa 2007 42.082 19951656 823.6856
Tanzania Africa 2007 52.517 38139640 1107.4822
Venezuela Americas 2007 73.747 26084662 11415.8057

\(\color{darkblue}{\textbf{Visualize}}\)


See ggplot or Spatial R

\(\color{darkblue}{\textbf{Model}}\)


See Statistics or tidymodels

\(\color{darkblue}{\textbf{Communicate}}\)


See Markdown or Shiny